---
title: "Nationality"
author: "melissa_barales"
date: "1/15/2022"
output: pdf_document
---

```{r setup}
# Point every later chunk at the project folder. A setwd() call inside a chunk
# is undone by knitr when that chunk finishes, so the relative paths used below
# would not resolve while knitting; the root.dir option persists across chunks.
knitr::opts_knit$set(root.dir = path.expand("~/Desktop/Senior Thesis/Census"))
```

```{r}
# Load the dataset containing both ACS files (path is relative to root.dir).
dat <- readRDS("Merged_ACS.rds")
```


```{r}
library(tidyverse)
library(stargazer)
```

```{r}
# Calculate each ethnicity's weighted share of the population within puma_2.
sums <- dat %>%
  group_by(puma_2, ethnicity) %>%
  # Weighted person count per puma_2 x ethnicity cell. Keeping the puma_2
  # grouping is stated explicitly, because the share and rank below must be
  # computed within each puma_2.
  dplyr::summarise(
    persons = sum(perwt.x, na.rm = TRUE),
    .groups = "drop_last"
  ) %>%
  dplyr::mutate(
    prop = persons / sum(persons, na.rm = TRUE),
    # Rank 1 is the ethnicity with the largest share in the puma_2.
    rank = min_rank(desc(prop))
  ) %>%
  dplyr::arrange(puma_2, rank) %>%
  # Hand back an ungrouped table so later steps are not silently run per group.
  dplyr::ungroup()
```

```{r}
# Keep only the puma_2 x ethnicity rows whose share is at least 30%.
# which() drops rows with a missing share, as subset() would.
prop_over30 <- sums[which(sums$prop >= 0.30), ]
```

```{r}
# Number of qualifying rows per ethnicity, with a column for missing values.
table(prop_over30$ethnicity, useNA = "always")
``` 

```{r}
# Hispanic enclaves: rows of `sums` where Hispanics make up at least 30% of
# the puma_2 population.
# NOTE(review): only the 30% threshold is applied here; `rank` is available in
# `sums` but is not part of the condition - confirm that is intended.
Hisp_enclaves <- sums %>%
  filter(ethnicity == "Hispanic", prop >= 0.30)
```

```{r}
# Save the enclave list to the census folder.
write.csv(Hisp_enclaves, file = "~/Desktop/Senior Thesis/Census/Hisp_enclaves.csv")
```

**ENCLAVE RESIDENCY VARIABLE**

```{r}
# enclave_res flags Hispanic respondents whose puma_2 appears in the list of
# Hispanic enclaves (share of 30% or more).
lives_in_enclave_puma <- dat$puma_2 %in% Hisp_enclaves$puma_2
dat$enclave_res <- dat$ethnicity == "Hispanic" & lives_in_enclave_puma
```

```{r}
# Distribution of the flag, including missing values.
table(dat$enclave_res, useNA = "always")
```

```{r}
# Rows of `sums` where an ethnicity holds at least 30% of its puma_2.
prop_over30 <- sums[which(sums$prop >= 0.30), ]
```

```{r}
table(prop_over30$ethnicity, useNA = "always")
```

```{r}
# Complement: rows where the ethnicity's share is below 30%.
prop_under30 <- sums[which(sums$prop < 0.30), ]
```


**Regression of Enclave Effect on Socio-Demographic Variables**

```{r}
# Recode the missing-value codes of the dependent variables to NA.
# Wage and salary income: 999999 and 999998 are treated as missing.
dat$incwage[dat$incwage %in% c(999998, 999999)] <- NA
```

```{r}
summary(dat$incwage, na.rm = TRUE)
# Note: the median is lower than the mean, i.e. the distribution is
# right-skewed.
```


```{r}
# Use of food stamps: code 0 is treated as missing.
dat$foodstmp[dat$foodstmp %in% 0] <- NA
```

```{r}
# Household income: 9999999 is treated as missing.
dat$hhincome[dat$hhincome %in% 9999999] <- NA
# TODO: consider using family income instead.
```

```{r}
summary(dat$hhincome, na.rm = TRUE)
```


```{r}
# Recode the education codes (educ) into a categorical variable holding the
# highest level of education received. Codes outside the listed ranges
# (including 0) stay NA. The letter prefixes fix the alphabetical order of the
# categories.
education_codes <- list(
  "a - no high school degree" = 1:5,
  "b - High school degree"    = 6,
  "c - Some college"          = 7:9,
  "d - College degree"        = 10,
  "e - Postgraduate"          = 11
)
dat$education <- NA
for (education_label in names(education_codes)) {
  in_category <- dat$educ %in% education_codes[[education_label]]
  dat$education[in_category] <- education_label
}
# Author's note: the number of people at high school and below seems low.
# NOTE(review): if code 0 also covers respondents with no schooling, they are
# excluded here, which could contribute to that - check the codebook.
```

```{r}
# Share of respondents in each education category (missing values excluded).
prop.table(table(dat$education))
```

```{r}
# Convert education to a factor for use in the regressions.
dat$education <- as.factor(dat$education)
```

```{r}
# Employment status: code 0 is treated as missing.
dat$empstat[dat$empstat %in% 0] <- NA
```

```{r}
# Percentage distribution of the remaining employment-status codes.
prop.table(table(dat$empstat)) * 100
```

```{r}
# Binary employment indicator: 1 when empstat is 1, 0 when empstat is 2 or 3,
# and NA otherwise (which includes the values set to missing above).
dat$emp <- NA
dat$emp[dat$empstat %in% 1] <- 1
dat$emp[dat$empstat %in% c(2, 3)] <- 0
```

```{r}
table(dat$emp)
```

```{r}
# Squared age term for the regressions.
dat$age_squared <- dat$age^2
```

```{r}
# Work with the birthplace codes as numbers.
# NOTE(review): as.numeric() returns internal level codes when applied to a
# factor - confirm how bpl is stored before this conversion.
dat$bpl <- as.numeric(dat$bpl)
```

```{r}
class(dat$bpl)
```

```{r}
# Foreign-born indicator: 0 when bpl is 99 or lower, 1 when it is above 99,
# NA when bpl is missing.
dat$foreign <- as.numeric(dat$bpl > 99)
```

```{r}
table(dat$foreign)
```

```{r}
# Hispanic nationality variable (hispnat), built from the respondent's detailed
# birthplace code (bpld) and the mother's birthplace code (bpl_mom_new).
```

```{r}
dat$hispnat <- NA
```

```{r}
names(dat)
```

```{r}
dat$bpl_mom_new <- as.numeric(dat$bpl_mom_new)
```

```{r}
class(dat$bpl_mom_new)
```

```{r}
# A respondent is assigned to an origin when either their own detailed
# birthplace code or their mother's birthplace code matches. The origins are
# applied in the order listed, so a later match overwrites an earlier one.
origin_codes <- list(
  list(label = "Mexico", bpld = 20000, mom = 200),
  list(
    label = "Central America",
    bpld = c(21030, 21040, 21060, 21020, 21070, 21050),
    mom = 210
  ),
  list(label = "Cuba", bpld = 25000, mom = 250),
  list(
    label = "South America",
    bpld = c(30005, 30010, 30020, 30025, 30030, 30045, 30050, 30060, 30065),
    mom = 300
  ),
  list(label = "Puerto Rico", bpld = 11000, mom = 110)
)
for (origin in origin_codes) {
  origin_match <- dat$bpld %in% origin$bpld | dat$bpl_mom_new %in% origin$mom
  dat$hispnat[origin_match] <- origin$label
}
```

```{r}
# America: respondent's bpld is at most 9900 and the mother's code is below 100.
# The leading "a" in the label is presumably there so that this category sorts
# first and becomes the reference level of the factor - verify the level order
# in the tables below.
american_rows <- which(dat$bpld <= 9900 & dat$bpl_mom_new < 100)
dat$hispnat[american_rows] <- "aAmerican"
```

```{r}
table(dat$hispnat, useNA = "always")
```

```{r}
class(dat$hispnat)
```

```{r}
# Store the nationality as a factor for use in the regressions.
dat$hispnat <- as.factor(dat$hispnat)
```

```{r}
class(dat$hispnat)
```

```{r}
table(dat$hispnat)
```

```{r}
# Dummy variables for the Hispanic nationalities only.
```

```{r}
# hispnat2 keeps the five Hispanic origin labels from hispnat and is NA for
# everyone else.
hispanic_origins <- c(
  "Central America", "Cuba", "Mexico", "Puerto Rico", "South America"
)
dat$hispnat2 <- NA
for (origin_label in hispanic_origins) {
  dat$hispnat2[dat$hispnat %in% origin_label] <- origin_label
}
```

```{r}
table(dat$hispnat2, useNA = "always")
```


```{r}
# One 0/1 indicator per origin. Each is NA wherever hispnat2 is NA, so on the
# rows with a non-missing hispnat2 exactly one of the five indicators equals 1.
dat$Mexican <- as.numeric(dat$hispnat2 == "Mexico")
```

```{r}
table(dat$Mexican)
```

```{r}
dat$Cuban <- as.numeric(dat$hispnat2 == "Cuba")
```

```{r}
table(dat$Cuban)
```

```{r}
dat$Puerto_Rican <- as.numeric(dat$hispnat2 == "Puerto Rico")
```

```{r}
table(dat$Puerto_Rican)
```

```{r}
dat$Central_American <- as.numeric(dat$hispnat2 == "Central America")
```

```{r}
table(dat$Central_American)
```

```{r}
dat$South_American <- as.numeric(dat$hispnat2 == "South America")
```

```{r}
table(dat$South_American)
```


**Wage and Salary Income**
```{r}
# Wage and salary income regressed on enclave residency (interacted with age)
# plus demographic controls and state indicators.
# The state variable is referenced by column name rather than through dat$:
# with data = dat the fit is the same, but the coefficient names stay clean and
# predict()/update() can locate the column in other data.
incwage2 <- lm(
  incwage ~ enclave_res * age + age_squared + sex + density + education +
    foreign + hispnat + statefip_name,
  data = dat
)
```

```{r}
summary(incwage2)
```

```{r echo=FALSE}
# Plain-text table, printed as ordinary chunk output. results='asis' is meant
# for LaTeX/HTML output and would let the text table be re-parsed as markdown.
stargazer(incwage2, type = "text")
```


**Total Household Income**
```{r}
hincome2 <- lm(data = dat, hhincome ~  enclave_res + age + age_squared +  
               sex + density + education + foreign + hispnat + dat$statefip_name)
```

```{r}
summary(hincome2)
``` 

```{r echo=FALSE}
stargazer(hincome2, type = "text")
``` 

**Food Stamp Use**
```{r}
# Food stamp use regressed on enclave residency plus demographic controls and
# state indicators.
# The state variable is referenced by column name rather than through dat$:
# with data = dat the fit is the same, but the coefficient names stay clean and
# predict()/update() can locate the column in other data.
foodstmp2 <- lm(
  foodstmp ~ enclave_res + age + age_squared + sex + density + education +
    foreign + hispnat + statefip_name,
  data = dat
)
```

```{r}
summary(foodstmp2)
```

```{r echo=FALSE}
stargazer(foodstmp2, type = "text")
```


**Employment**
```{r}
# Employment indicator (emp) regressed on enclave residency plus demographic
# controls and state indicators.
# The state variable is referenced by column name rather than through dat$:
# with data = dat the fit is the same, but the coefficient names stay clean and
# predict()/update() can locate the column in other data.
employ2 <- lm(
  emp ~ enclave_res + age + age_squared + sex + density + education +
    foreign + hispnat + statefip_name,
  data = dat
)
```

```{r}
summary(employ2)
```

```{r echo=FALSE}
# Plain-text table, printed as ordinary chunk output. results='asis' is meant
# for LaTeX/HTML output and would let the text table be re-parsed as markdown.
stargazer(employ2, type = "text")
```

```{r eval=FALSE}
# Interactive help lookup, kept for reference; not evaluated when knitting.
?stargazer
```

```{r}
# Combined regression table for the four outcome models.
# covariate.labels are positional. A single "State" label would be attached to
# just one state coefficient and leave the others with raw names, so the state
# indicators are collapsed into one "State fixed effects" row instead. The
# pattern matches the state terms whether or not their names carry a dat$
# prefix. The last label is for the enclave x age interaction, which appears
# only in the wage model and is listed after the main effects.
# NOTE(review): the labels assume sex and density each contribute exactly one
# coefficient - confirm against summary() of the models.
stargazer(
  incwage2, hincome2, foodstmp2, employ2,
  dep.var.labels = c(
    "Salary Income", "Household Income", "Food Stamps Use", "Employment"
  ),
  covariate.labels = c(
    "Enclave Residency", "Age", "Age Squared",
    "Sex", "PUMA Density",
    "High School Degree", "Some College",
    "College Degree",
    "Postgraduate", "Foreign-born", "Central American", "Cuban",
    "Mexican", "Puerto Rican", "South American",
    "Enclave Residency x Age"
  ),
  omit = "statefip_name",
  omit.labels = "State fixed effects",
  align = TRUE,
  title = "Effect of Enclave Residency on Socio-demographic Outcomes",
  type = "text",
  out = "combined_model_hisp_1.htm"
)
```


**T-TEST**

```{r}
# Estimating how well socio-demographic variables predict enclave residency.
```


```{r}
# 1 when the birthplace code is below 100, 0 when it is 100 or above, and NA
# when bpl is missing.
# NOTE(review): this uses only the respondent's own birthplace code - confirm
# that it captures "second generation" as intended.
dat$second_generation <- as.numeric(dat$bpl < 100)
```

```{r}
table(dat$second_generation)
```

```{r}
# Linear model of enclave residency on socio-demographic traits.
# South_American is deliberately left out and serves as the reference
# category. The five nationality dummies are NA together and otherwise sum to
# 1, so on the rows lm() keeps they are perfectly collinear with the intercept.
# Including all five made lm() report NA for the last one. The other
# nationality coefficients are unchanged and read as differences from South
# Americans.
diff <- lm(
  enclave_res ~ age + sex + education + second_generation + Mexican + Cuban +
    Puerto_Rican + Central_American,
  data = dat
)
```

```{r echo=FALSE}
# Plain-text table in the document; the .htm extension of `out` makes
# stargazer write an HTML copy to that file.
stargazer(diff, type = "text", out = "new_model.htm")
```

```{r}
library(dotwhisker)
```

```{r}
# Dot-and-whisker plot of the enclave-residency model with 95% intervals.

# Dashed grey reference line at zero.
zero_line <- function() {
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2)
}

# Order in which the coefficients are displayed.
coef_order <- c(
  "age", "sex", "educationb - High school degree",
  "educationc - Some college", "educationd - College degree",
  "educatione - Postgraduate", "second_generation",
  "Mexican", "Cuban",
  "Puerto_Rican", "Central_American"
)

enclave_plot <- dwplot(
  diff,
  ci = 0.95,
  vline = zero_line(),
  vars_order = coef_order
)

# Replace the raw coefficient names with readable labels.
enclave_plot <- relabel_predictors(
  enclave_plot,
  age = "Age",
  sex = "Sex (Female)",
  "educationb - High school degree" = "HS Graduate",
  "educationc - Some college" = "Some College",
  "educationd - College degree" = "College Graduate",
  "educatione - Postgraduate" = "Post Graduate",
  "second_generation" = "Second Generation",
  Mexican = "Mexican",
  Cuban = "Cuban",
  Puerto_Rican = "Puerto Rican",
  Central_American = "Central American"
)

# NOTE(review): font_size() does not appear to come from the packages loaded
# in this file (tidyverse, stargazer, dotwhisker) - confirm which package
# provides it and load it explicitly.
# NOTE(review): the zero line is also passed to dwplot() via `vline`, so it is
# drawn twice - confirm whether the extra layer below is needed.
enclave_plot +
  font_size(labels.y = 10) +
  xlab("Likelihood of Enclave Residency") +
  ylab("") +
  zero_line()
```

```{r}
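# South American does not show up because it is the reference category: for
# every row kept in the model exactly one of the five nationality dummies is 1,
# so a fifth coefficient cannot be estimated alongside the intercept.
# The plotted nationality coefficients are therefore read as differences
# relative to South Americans, not relative to other nationalities in general.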
```











